{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "5eb60008",
   "metadata": {},
   "source": [
    "# Generative Pre-trained Language Models: Theory and Practice\n",
    "深蓝学院 (Shenlan Academy) course\n",
    "Course link: https://www.shenlanxueyuan.com/course/620\n",
    "\n",
    "Author: **黄佳 (Huang Jia)**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "d3216379",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary: ['is', 'Mazong', 'Student', 'Niuzong', 'Xiaoxue', 'Kage', 'Xiaobing', 'Boss', 'Teacher']\n",
      "Word-to-index dictionary: {'is': 0, 'Mazong': 1, 'Student': 2, 'Niuzong': 3, 'Xiaoxue': 4, 'Kage': 5, 'Xiaobing': 6, 'Boss': 7, 'Teacher': 8}\n",
      "Index-to-word dictionary: {0: 'is', 1: 'Mazong', 2: 'Student', 3: 'Niuzong', 4: 'Xiaoxue', 5: 'Kage', 6: 'Xiaobing', 7: 'Boss', 8: 'Teacher'}\n",
      "Vocabulary size: 9\n"
     ]
    }
   ],
   "source": [
    "# Define a list of sentences; they will be used later to train the CBOW and Skip-Gram models\n",
    "sentences = [\"Kage is Teacher\", \"Mazong is Boss\", \"Niuzong is Boss\",\n",
    "             \"Xiaobing is Student\", \"Xiaoxue is Student\"]\n",
    "# Join all sentences together, then split on whitespace into words\n",
    "words = ' '.join(sentences).split()\n",
    "# Build the vocabulary, removing duplicate words\n",
    "word_list = list(set(words))\n",
    "# Create a dictionary mapping each word to a unique index\n",
    "word_to_idx = {word: idx for idx, word in enumerate(word_list)}\n",
    "# Create a dictionary mapping each index back to its word\n",
    "idx_to_word = {idx: word for idx, word in enumerate(word_list)}\n",
    "voc_size = len(word_list) # Size of the vocabulary\n",
    "print(\"Vocabulary:\", word_list)\n",
    "print(\"Word-to-index dictionary:\", word_to_idx)\n",
    "print(\"Index-to-word dictionary:\", idx_to_word)\n",
    "print(\"Vocabulary size:\", voc_size)"
   ]
  },
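  {
   "cell_type": "markdown",
   "id": "1a6c0d27",
   "metadata": {},
   "source": [
    "Note: `set` iteration order is not guaranteed to be stable across Python processes, so the exact word-to-index mapping may differ between runs. If reproducible indices matter, building the vocabulary with `sorted(set(words))` would make the mapping deterministic."
   ]
  },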
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "de8f4ae7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skip-Gram data sample (unencoded): [('is', 'Kage'), ('Teacher', 'Kage'), ('Kage', 'is'), ('Teacher', 'is'), ('Kage', 'Teacher'), ('is', 'Teacher'), ('is', 'Mazong'), ('Boss', 'Mazong'), ('Mazong', 'is'), ('Boss', 'is'), ('Mazong', 'Boss'), ('is', 'Boss'), ('is', 'Niuzong'), ('Boss', 'Niuzong'), ('Niuzong', 'is'), ('Boss', 'is'), ('Niuzong', 'Boss'), ('is', 'Boss'), ('is', 'Xiaobing'), ('Student', 'Xiaobing'), ('Xiaobing', 'is'), ('Student', 'is'), ('Xiaobing', 'Student'), ('is', 'Student'), ('is', 'Xiaoxue'), ('Student', 'Xiaoxue'), ('Xiaoxue', 'is'), ('Student', 'is'), ('Xiaoxue', 'Student'), ('is', 'Student')]\n"
     ]
    }
   ],
   "source": [
    "# Generate the Skip-Gram training data\n",
    "def create_skipgram_dataset(sentences, window_size=2):\n",
    "    data = []\n",
    "    for sentence in sentences:\n",
    "        sentence = sentence.split()  # Split the sentence into a list of words\n",
    "        for idx, word in enumerate(sentence):  # Iterate over words and their positions\n",
    "            # Collect the neighboring words: up to window_size words on each side\n",
    "            for neighbor in sentence[max(idx - window_size, 0): \n",
    "                        min(idx + window_size + 1, len(sentence))]:\n",
    "                if neighbor != word:  # Exclude the current word itself\n",
    "                    # Pair the neighboring word with the current word as one training example\n",
    "                    data.append((neighbor, word))\n",
    "    return data\n",
    "# Create the Skip-Gram training data\n",
    "skipgram_data = create_skipgram_dataset(sentences)\n",
    "# Print the (unencoded) Skip-Gram training pairs\n",
    "print(\"Skip-Gram data sample (unencoded):\", skipgram_data)"
   ]
  },
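  {
   "cell_type": "markdown",
   "id": "8c2d41fe",
   "metadata": {},
   "source": [
    "To see how `window_size` shapes the training data, the quick check below (an illustrative sketch, not part of the original course code) rebuilds the dataset with different window sizes and compares how many pairs each produces."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3f7a5b21",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch: smaller windows pair each word with fewer neighbors,\n",
    "# so the dataset shrinks as window_size decreases\n",
    "for w in (1, 2):\n",
    "    pairs = create_skipgram_dataset(sentences, window_size=w)\n",
    "    print(f\"window_size={w}: {len(pairs)} pairs, e.g. {pairs[:2]}\")"
   ]
  },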
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "72158e33",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Word before One-Hot encoding: Teacher\n",
      "One-Hot encoded vector: tensor([0., 0., 0., 0., 0., 0., 0., 0., 1.])\n",
      "Skip-Gram data sample (encoded): [(tensor([1., 0., 0., 0., 0., 0., 0., 0., 0.]), 5), (tensor([0., 0., 0., 0., 0., 0., 0., 0., 1.]), 5), (tensor([0., 0., 0., 0., 0., 1., 0., 0., 0.]), 0)]\n"
     ]
    }
   ],
   "source": [
    "# Define the One-Hot encoding function\n",
    "import torch # Import the torch library\n",
    "def one_hot_encoding(word, word_to_idx):\n",
    "    # Create an all-zeros tensor with the same length as the vocabulary\n",
    "    tensor = torch.zeros(len(word_to_idx))\n",
    "    tensor[word_to_idx[word]] = 1  # Set the position of the word's index to 1\n",
    "    return tensor  # Return the resulting One-Hot vector\n",
    "\n",
    "# Show the data before and after One-Hot encoding\n",
    "word_example = \"Teacher\"\n",
    "print(\"Word before One-Hot encoding:\", word_example)\n",
    "print(\"One-Hot encoded vector:\", one_hot_encoding(word_example, word_to_idx))\n",
    "\n",
    "# Show an encoded sample of the Skip-Gram data (first three pairs)\n",
    "print(\"Skip-Gram data sample (encoded):\", [(one_hot_encoding(context, word_to_idx), \n",
    "          word_to_idx[target]) for context, target in skipgram_data[:3]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "31ae5270",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Skip-Gram model: SkipGram(\n",
      "  (input_to_hidden): Embedding(9, 2)\n",
      "  (hidden_to_output): Linear(in_features=2, out_features=9, bias=False)\n",
      ")\n"
     ]
    }
   ],
   "source": [
    "# Define the Skip-Gram model\n",
    "import torch.nn as nn # Import the neural network module\n",
    "class SkipGram(nn.Module):\n",
    "    def __init__(self, voc_size, embedding_size):\n",
    "        super(SkipGram, self).__init__()\n",
    "        # Input-to-hidden layer: an embedding lookup from word index to dense vector\n",
    "        self.input_to_hidden = nn.Embedding(voc_size, embedding_size)\n",
    "        # Hidden-to-output layer: a linear projection back to vocabulary-sized scores\n",
    "        self.hidden_to_output = nn.Linear(embedding_size, voc_size, bias=False)\n",
    "\n",
    "    def forward(self, X):\n",
    "        hidden_layer = self.input_to_hidden(X)  # Hidden layer: [batch_size, embedding_size]\n",
    "        output_layer = self.hidden_to_output(hidden_layer)  # Output layer: [batch_size, voc_size]\n",
    "        return output_layer\n",
    "\n",
    "embedding_size = 2 # Embedding size; 2 is chosen here so the vectors are easy to visualize\n",
    "skipgram_model = SkipGram(voc_size, embedding_size)  # Instantiate the SkipGram model\n",
    "print(\"Skip-Gram model:\", skipgram_model)"
   ]
  },
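  {
   "cell_type": "markdown",
   "id": "9e0c7d35",
   "metadata": {},
   "source": [
    "Although `one_hot_encoding` above is shown for illustration, the model itself consumes word *indices*: `nn.Embedding` performs a direct row lookup in its weight matrix. The sanity check below (an illustrative sketch using the names defined above) confirms that multiplying a One-Hot vector by the embedding weights yields the same vector as the lookup."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5d8e6f02",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check (sketch): one-hot vector x weight matrix == embedding row lookup\n",
    "word = \"Teacher\"\n",
    "one_hot = one_hot_encoding(word, word_to_idx)  # [voc_size]\n",
    "via_matmul = (one_hot @ skipgram_model.input_to_hidden.weight).detach()  # [voc_size] @ [voc_size, embedding_size]\n",
    "via_lookup = skipgram_model.input_to_hidden(torch.tensor(word_to_idx[word]))\n",
    "print(torch.allclose(via_matmul, via_lookup))  # Expected: True"
   ]
  },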
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "311ccd16",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Train the Skip-Gram model\n",
    "learning_rate = 0.001 # Learning rate\n",
    "epochs = 1000 # Number of training epochs\n",
    "criterion = nn.CrossEntropyLoss()  # Cross-entropy loss\n",
    "import torch.optim as optim # Import the optimizer module\n",
    "optimizer = optim.SGD(skipgram_model.parameters(), lr=learning_rate)  # Stochastic gradient descent\n",
    "\n",
    "# Training loop\n",
    "loss_values = []  # Stores the average loss recorded every 100 epochs\n",
    "for epoch in range(epochs):\n",
    "    loss_sum = 0 # Accumulated loss for this epoch\n",
    "    for context, target in skipgram_data:\n",
    "        # The surrounding word goes in as an index; nn.Embedding looks up its vector\n",
    "        # directly, so no One-Hot encoding is needed here. CrossEntropyLoss expects a\n",
    "        # batch dimension, hence the shape-[1] tensors.\n",
    "        X = torch.tensor([word_to_idx[context]], dtype=torch.long)  # Surrounding word index, shape [1]\n",
    "        y_true = torch.tensor([word_to_idx[target]], dtype=torch.long)  # Target word index, shape [1]\n",
    "        y_pred = skipgram_model(X)  # Forward pass: [1, voc_size]\n",
    "        loss = criterion(y_pred, y_true)  # Compute the loss\n",
    "        loss_sum += loss.item() # Accumulate the loss\n",
    "        optimizer.zero_grad()  # Clear the gradients\n",
    "        loss.backward()  # Backpropagate\n",
    "        optimizer.step()  # Update the parameters\n",
    "    if (epoch+1) % 100 == 0: # Every 100 epochs, print and record the average loss\n",
    "        print(f\"Epoch: {epoch+1}, Loss: {loss_sum/len(skipgram_data)}\")\n",
    "        loss_values.append(loss_sum / len(skipgram_data))\n",
    "\n",
    "# Plot the training loss curve\n",
    "import matplotlib.pyplot as plt # Import matplotlib\n",
    "plt.plot(range(100, epochs + 1, 100), loss_values) # Plot loss against the epochs at which it was recorded\n",
    "plt.title('Training Loss') # Title\n",
    "plt.xlabel('Epochs') # X-axis label\n",
    "plt.ylabel('Loss') # Y-axis label\n",
    "plt.show() # Show the plot"
   ]
  },
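  {
   "cell_type": "markdown",
   "id": "b47d9a10",
   "metadata": {},
   "source": [
    "With training finished, a quick qualitative check (an illustrative sketch, not part of the original course code): feed one context word to the model and apply a softmax over the output scores to see which center words it now considers most likely."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c91e2f58",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch: rank predicted center words for one context word\n",
    "import torch.nn.functional as F\n",
    "context_word = \"Student\"  # Any word from the vocabulary works here\n",
    "x = torch.tensor([word_to_idx[context_word]], dtype=torch.long)\n",
    "with torch.no_grad():\n",
    "    probs = F.softmax(skipgram_model(x), dim=1).squeeze(0)  # [voc_size]\n",
    "top = torch.topk(probs, k=3)  # The three most likely center words\n",
    "for p, i in zip(top.values, top.indices):\n",
    "    print(f\"{idx_to_word[i.item()]}: {p.item():.3f}\")"
   ]
  },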
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "75e56ed1",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Print the word embeddings learned by Skip-Gram\n",
    "print(\"\\nSkip-Gram word embeddings:\")\n",
    "for word, idx in word_to_idx.items(): # Print each word's embedding vector\n",
    "    print(f\"{word}: {skipgram_model.input_to_hidden.weight[idx].detach().numpy()}\")"
   ]
  },
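  {
   "cell_type": "markdown",
   "id": "d62a8e43",
   "metadata": {},
   "source": [
    "Words that appear in similar contexts should end up with similar vectors. The sketch below (added for illustration) compares pairs of embeddings with cosine similarity; after successful training, `Xiaobing` and `Xiaoxue` (both Students) would be expected to score higher than an unrelated pair, though with such a tiny corpus the outcome can vary from run to run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e73b9c54",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative sketch: cosine similarity between learned word vectors\n",
    "import torch.nn.functional as F\n",
    "def word_vec(word): # Fetch the learned embedding row for a word\n",
    "    return skipgram_model.input_to_hidden.weight[word_to_idx[word]].detach()\n",
    "for a, b in [(\"Xiaobing\", \"Xiaoxue\"), (\"Xiaobing\", \"Boss\")]:\n",
    "    sim = F.cosine_similarity(word_vec(a), word_vec(b), dim=0)\n",
    "    print(f\"cos({a}, {b}) = {sim.item():.3f}\")"
   ]
  },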
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1670078",
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.rcParams['font.sans-serif'] = ['SimHei'] # Sans-serif font (SimHei covers CJK glyphs)\n",
    "plt.rcParams['axes.unicode_minus'] = False # Render the minus sign correctly with CJK fonts\n",
    "# Plot the 2D word vectors\n",
    "fig, ax = plt.subplots()\n",
    "for word, idx in word_to_idx.items():\n",
    "    # Get each word's embedding vector\n",
    "    vec = skipgram_model.input_to_hidden.weight[idx].detach().numpy()\n",
    "    ax.scatter(vec[0], vec[1]) # Draw a point for the embedding vector\n",
    "    ax.annotate(word, (vec[0], vec[1]), fontsize=12) # Label the point with its word\n",
    "plt.title('2D Word Embeddings') # Title\n",
    "plt.xlabel('Embedding Dimension 1') # X-axis label\n",
    "plt.ylabel('Embedding Dimension 2') # Y-axis label\n",
    "plt.show() # Show the plot"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
